This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.
knitr::opts_chunk$set(cache=TRUE)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ purrr 0.3.4
## ✓ tibble 3.0.3 ✓ dplyr 1.0.2
## ✓ tidyr 1.1.2 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(faraway) #halfnorm
library(TSA) #eacf
##
## Attaching package: 'TSA'
## The following object is masked from 'package:readr':
##
## spec
## The following objects are masked from 'package:stats':
##
## acf, arima
## The following object is masked from 'package:utils':
##
## tar
library(forecast) #autoarima
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
## Registered S3 methods overwritten by 'forecast':
## method from
## fitted.Arima TSA
## plot.Arima TSA
library(gridExtra) #grid arrange
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Cmd+Shift+Enter.
setwd("/Users/Large_Data/Final_Data")
test <- read.csv(file = '202010.csv')
head(test)
glimpse(test)
## Rows: 352,106
## Columns: 110
## $ Year <int> 2020, 2020, 2020, 2020, 2020, 2020, 2…
## $ Quarter <int> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4…
## $ Month <int> 10, 10, 10, 10, 10, 10, 10, 10, 10, 1…
## $ DayofMonth <int> 8, 9, 11, 12, 13, 14, 15, 16, 18, 19,…
## $ DayOfWeek <int> 4, 5, 7, 1, 2, 3, 4, 5, 7, 1, 2, 3, 4…
## $ FlightDate <chr> "2020-10-08", "2020-10-09", "2020-10-…
## $ Reporting_Airline <chr> "AA", "AA", "AA", "AA", "AA", "AA", "…
## $ DOT_ID_Reporting_Airline <int> 19805, 19805, 19805, 19805, 19805, 19…
## $ IATA_CODE_Reporting_Airline <chr> "AA", "AA", "AA", "AA", "AA", "AA", "…
## $ Tail_Number <chr> "N932AM", "N934AA", "N992AU", "N132AN…
## $ Flight_Number_Reporting_Airline <int> 2259, 2259, 2259, 2259, 2259, 2259, 2…
## $ OriginAirportID <int> 11298, 11298, 11298, 11298, 11298, 11…
## $ OriginAirportSeqID <int> 1129806, 1129806, 1129806, 1129806, 1…
## $ OriginCityMarketID <int> 30194, 30194, 30194, 30194, 30194, 30…
## $ Origin <chr> "DFW", "DFW", "DFW", "DFW", "DFW", "D…
## $ OriginCityName <chr> "Dallas/Fort Worth, TX", "Dallas/Fort…
## $ OriginState <chr> "TX", "TX", "TX", "TX", "TX", "TX", "…
## $ OriginStateFips <int> 48, 48, 48, 48, 48, 48, 48, 48, 48, 4…
## $ OriginStateName <chr> "Texas", "Texas", "Texas", "Texas", "…
## $ OriginWac <int> 74, 74, 74, 74, 74, 74, 74, 74, 74, 7…
## $ DestAirportID <int> 14107, 14107, 14107, 14107, 14107, 14…
## $ DestAirportSeqID <int> 1410702, 1410702, 1410702, 1410702, 1…
## $ DestCityMarketID <int> 30466, 30466, 30466, 30466, 30466, 30…
## $ Dest <chr> "PHX", "PHX", "PHX", "PHX", "PHX", "P…
## $ DestCityName <chr> "Phoenix, AZ", "Phoenix, AZ", "Phoeni…
## $ DestState <chr> "AZ", "AZ", "AZ", "AZ", "AZ", "AZ", "…
## $ DestStateFips <int> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4…
## $ DestStateName <chr> "Arizona", "Arizona", "Arizona", "Ari…
## $ DestWac <int> 81, 81, 81, 81, 81, 81, 81, 81, 81, 8…
## $ CRSDepTime <int> 1430, 1430, 1430, 1430, 1430, 1430, 1…
## $ DepTime <int> 1432, 1449, 1428, 1432, 1432, 1429, 1…
## $ DepDelay <dbl> 2, 19, -2, 2, 2, -1, -2, 110, 0, 33, …
## $ DepDelayMinutes <dbl> 2, 19, 0, 2, 2, 0, 0, 110, 0, 33, 3, …
## $ DepDel15 <dbl> 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0…
## $ DepartureDelayGroups <int> 0, 1, -1, 0, 0, -1, -1, 7, 0, 2, 0, 1…
## $ DepTimeBlk <chr> "1400-1459", "1400-1459", "1400-1459"…
## $ TaxiOut <dbl> 15, 20, 13, 20, 18, 16, 20, 17, 12, 1…
## $ WheelsOff <int> 1447, 1509, 1441, 1452, 1450, 1445, 1…
## $ WheelsOn <int> 1447, 1508, 1449, 1452, 1458, 1447, 1…
## $ TaxiIn <dbl> 5, 6, 9, 7, 10, 7, 9, 9, 6, 9, 6, 6, …
## $ CRSArrTime <int> 1505, 1505, 1505, 1505, 1505, 1505, 1…
## $ ArrTime <int> 1452, 1514, 1458, 1459, 1508, 1454, 1…
## $ ArrDelay <dbl> -13, 9, -7, -6, 3, -11, -1, 101, -15,…
## $ ArrDelayMinutes <dbl> 0, 9, 0, 0, 3, 0, 0, 101, 0, 24, 0, 1…
## $ ArrDel15 <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0…
## $ ArrivalDelayGroups <int> -1, 0, -1, -1, 0, -1, -1, 6, -1, 1, -…
## $ ArrTimeBlk <chr> "1500-1559", "1500-1559", "1500-1559"…
## $ Cancelled <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ CancellationCode <chr> "", "", "", "", "", "", "", "", "", "…
## $ Diverted <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ CRSElapsedTime <dbl> 155, 155, 155, 155, 155, 155, 155, 15…
## $ ActualElapsedTime <dbl> 140, 145, 150, 147, 156, 145, 156, 14…
## $ AirTime <dbl> 120, 119, 128, 120, 128, 122, 127, 12…
## $ Flights <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ Distance <dbl> 868, 868, 868, 868, 868, 868, 868, 86…
## $ DistanceGroup <int> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4…
## $ CarrierDelay <dbl> NA, NA, NA, NA, NA, NA, NA, 101, NA, …
## $ WeatherDelay <dbl> NA, NA, NA, NA, NA, NA, NA, 0, NA, 0,…
## $ NASDelay <dbl> NA, NA, NA, NA, NA, NA, NA, 0, NA, 0,…
## $ SecurityDelay <dbl> NA, NA, NA, NA, NA, NA, NA, 0, NA, 0,…
## $ LateAircraftDelay <dbl> NA, NA, NA, NA, NA, NA, NA, 0, NA, 7,…
## $ FirstDepTime <int> NA, NA, NA, NA, NA, NA, NA, 1427, NA,…
## $ TotalAddGTime <dbl> NA, NA, NA, NA, NA, NA, NA, 35, NA, N…
## $ LongestAddGTime <dbl> NA, NA, NA, NA, NA, NA, NA, 35, NA, N…
## $ DivAirportLandings <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ DivReachedDest <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ DivActualElapsedTime <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ DivArrDelay <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ DivDistance <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ Div1Airport <chr> "", "", "", "", "", "", "", "", "", "…
## $ Div1AirportID <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ Div1AirportSeqID <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ Div1WheelsOn <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ Div1TotalGTime <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ Div1LongestGTime <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ Div1WheelsOff <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ Div1TailNum <chr> "", "", "", "", "", "", "", "", "", "…
## $ Div2Airport <chr> "", "", "", "", "", "", "", "", "", "…
## $ Div2AirportID <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ Div2AirportSeqID <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ Div2WheelsOn <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ Div2TotalGTime <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ Div2LongestGTime <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ Div2WheelsOff <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ Div2TailNum <chr> "", "", "", "", "", "", "", "", "", "…
## $ Div3Airport <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ Div3AirportID <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ Div3AirportSeqID <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ Div3WheelsOn <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ Div3TotalGTime <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ Div3LongestGTime <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ Div3WheelsOff <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ Div3TailNum <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ Div4Airport <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ Div4AirportID <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ Div4AirportSeqID <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ Div4WheelsOn <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ Div4TotalGTime <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ Div4LongestGTime <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ Div4WheelsOff <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ Div4TailNum <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ Div5Airport <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ Div5AirportID <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ Div5AirportSeqID <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ Div5WheelsOn <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ Div5TotalGTime <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ Div5LongestGTime <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ Div5WheelsOff <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ Div5TailNum <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ X <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
Variables of our interest: FlightDate 2018-01-02 IATA_CODE_Reporting_Airline e.g. AA UA Flight_Number_Reporting_Airline 588 Origin JFK Dest SFO DepDelayMinutes <15 -> =0 ArrDelayMinutes Cancelled 0/1 Diverted 0/1
Airports: EWR Airplines: AA Overall Cancellation Delay Total 120 months
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
## The following object is masked from 'package:purrr':
##
## transpose
path = "/Users/Large_Data/Final_Data"
setwd(path)
pattern = '*.csv'
files = list.files(path, pattern, full.names = TRUE)
ap.numflights.df <- data.frame(ORIGIN=c('EWR','JFK'), n=c(0,0))
al.numflights.df <- data.frame(IATA_CODE_REPORTING_AIRLINE=c('AA','UA'), n=c(0,0))
dep.timeblk.numflights.df <- data.frame(DEPTIMEBLK=character(),n=integer())
arr.timeblk.numflights.df <- data.frame(ARRTIMEBLK=character(),n=integer())
num.total <- NULL
num.cancel <- NULL
num.ontime <- NULL #dep delay<=15min
avg.delay <- NULL
iter<-1
for (file in files){
# read one file at a time from files as df
df <- fread(file=file,sep = ",", stringsAsFactors = FALSE, header = TRUE)
names(df) <- toupper(names(df))
if ('FL_DATE' %in% names(df)){
df <- df[,c('OP_UNIQUE_CARRIER', 'ORIGIN', 'DEST',
'DEP_DELAY_NEW', 'DEP_TIME_BLK', 'ARR_DELAY_NEW',
'ARR_TIME_BLK', 'CANCELLED', 'DIVERTED', 'DISTANCE')]
names(df) <- c('IATA_CODE_REPORTING_AIRLINE', 'ORIGIN', 'DEST',
'DEPDELAYMINUTES', 'DEPTIMEBLK', 'ARRDELAYMINUTES',
'ARRTIMEBLK', 'CANCELLED', 'DIVERTED', 'DISTANCE')
}
else
df <- df[,c('IATA_CODE_REPORTING_AIRLINE', 'ORIGIN', 'DEST', 'DEPDELAYMINUTES', 'DEPTIMEBLK', 'ARRDELAYMINUTES', 'ARRTIMEBLK', 'CANCELLED', 'DIVERTED', 'DISTANCE')]
# Overall
num.total<-c(num.total, nrow(df))
num.cancel<-c(num.cancel, nrow(df[df$CANCELLED==1,]))
num.ontime<-c(num.ontime, nrow(df[(df$CANCELLED==0) &
(df$DIVERTED==0) &(df$DEPDELAYMINUTES<=15),]))
avg.delay <-c(avg.delay,mean(df[(df$CANCELLED==0) &
(df$DIVERTED==0) &
(df$DEPDELAYMINUTES>15),]$DEPDELAYMINUTES))
# find major airports and airlines frequence of time
ap.numflights.df<- bind_rows(ap.numflights.df,
count(df,ORIGIN)) %>%
group_by(ORIGIN) %>%
summarise_all(sum)
al.numflights.df<- bind_rows(al.numflights.df,
count(df,IATA_CODE_REPORTING_AIRLINE)) %>%
group_by(IATA_CODE_REPORTING_AIRLINE) %>%
summarise_all(sum)
dep.timeblk.numflights.df<- bind_rows(
dep.timeblk.numflights.df,
count(df,DEPTIMEBLK)) %>%
group_by(DEPTIMEBLK) %>%
summarise_all(sum)
arr.timeblk.numflights.df<- bind_rows(
arr.timeblk.numflights.df,
count(df,ARRTIMEBLK)) %>%
group_by(ARRTIMEBLK) %>%
summarise_all(sum)
if (iter%%10==0)
print(paste(iter,'/',length(files)))
iter<-iter+1
}
## [1] "10 / 120"
## [1] "20 / 120"
## [1] "30 / 120"
## [1] "40 / 120"
## [1] "50 / 120"
## [1] "60 / 120"
## [1] "70 / 120"
## [1] "80 / 120"
## [1] "90 / 120"
## [1] "100 / 120"
## [1] "110 / 120"
## [1] "120 / 120"
Airport departure delay airport arrival delay Identify number of scheduled flights departured in an airport monthly 9.75 yrs 3561 days
ap.numflights<-ap.numflights.df$n/9.75
al.numflights<-al.numflights.df$n/9.75
halfnorm(ap.numflights,labs = ap.numflights.df$ORIGIN,ylab='Num of Flights Annually',cex=0.2, nlab = 5,main='Finding Large Airports')
halfnorm(al.numflights,labs = al.numflights.df$IATA_CODE_REPORTING_AIRLINE,ylab='Num of Flights Annually',cex= 0.2,nlab = 5,main='Finding Large Airlines')
dep.timeblk.numflights.df<-dep.timeblk.numflights.df %>%
rename(TIMEBLK=DEPTIMEBLK) %>%
as.data.frame()
arr.timeblk.numflights.df<-arr.timeblk.numflights.df %>%
rename(TIMEBLK=ARRTIMEBLK) %>%
as.data.frame()
ggplot()+
geom_line(data=dep.timeblk.numflights.df[-1,],aes(x=as.factor(TIMEBLK),y=log10(n),col='DEP',group=1))+
geom_line(data=arr.timeblk.numflights.df[-1,],aes(x=as.factor(TIMEBLK),y=log10(n),col='ARR',group=1))+
theme(axis.text.x = element_text(angle = 45), plot.title = element_text(hjust = 0.5))+
ylab('log10(Num of Flights)')+
xlab('Time Blocks')+
ggtitle('Daily Variation of Num of Flights')
LAX DEN DFW ORD ATL Top 20%(76) Airports takes up more than 87% number of flights top 4.7%(18) takes up 50% of scheduled flights.
low.ap.numflights<-quantile(ap.numflights,0.9)
top_df<-ap.numflights.df[ap.numflights.df$n>=low.ap.numflights,]
sum(top_df$n/9.75)/sum(ap.numflights.df$n/9.75)
## [1] 0.948244
low.al.numflights<-quantile(al.numflights,0.9)
al.top_df<-al.numflights.df[al.numflights.df$n>=low.al.numflights,]
sum(al.top_df$n/9.75)/sum(al.numflights.df$n/9.75)
## [1] 0.9659906
Now let’s focus on these 20% airports. And calculate monthly cancellation rate, ontime rate, average delay min, average taxi out.
Overall cancel rate
cancel.list<-num.cancel/num.total
cancel.series<-ts(cancel.list,start=c(2011,1),frequency=12)
plot(cancel.series,type='o',main='Cancellation Variation',ylab='Cancellation Rate')
ontime.list<-num.ontime/num.total
ontime.series<-ts(ontime.list,start=c(2011,1),frequency=12)
plot(ontime.series,type='o',main='On-Time Rate Variation',ylab='On-Time Rate')
avgdelay.list<-avg.delay
avgdelay.series<-ts(avgdelay.list,start=c(2011,1),frequency=12)
plot(avgdelay.series,type='o',main='Average Delay Variation',ylab='Average Delay Minutes')
What happen in April 2020?? ontime and cancellation is negatively correlated.
before.pandemic.df<-data.frame(time=seq.Date(from = as.Date("2011-01-01"),
to=as.Date("2020-02-01"), by = "month"),
cancel=cancel.list[0:110],
ontime=ontime.list[0:110],
avgdelay=avgdelay.list[0:110])
cancel.plot<-ggplot(before.pandemic.df)+
geom_line(aes(x=time,y=cancel,group=1))+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5),
plot.title = element_text(hjust = 0.5))+
scale_x_date(date_breaks = "3 months",
date_labels = "%Y %m") +
xlab('')+
ggtitle('Cancellation Rate Jan 2011 to Mar 2020')+
ylab('Cancellation Rate')
ontime.plot<-ggplot(before.pandemic.df)+
geom_line(aes(x=time,y=ontime,group=2))+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5),
plot.title = element_text(hjust = 0.5))+
scale_x_date(date_breaks = "3 months",
date_labels = "%Y %m") +
ggtitle('On-Time Rate Jan 2011 to Mar 2020')+
xlab('Time')+
ylab('On-Time Rate')
grid.arrange(cancel.plot, ontime.plot, nrow=2)
ggplot(before.pandemic.df)+
geom_line(aes(x=time,y=avgdelay,group=1))+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5),
plot.title = element_text(hjust = 0.5))+
scale_x_date(date_breaks = "3 months",
date_labels = "%Y %m") +
xlab('Time')+
ggtitle('Average Delay Mins Jan 2011 to Mar 2020')+
ylab('Average Delay Mins')
cancel.series<-ts(cancel.list[0:110],start=c(2011,1),frequency=12)
#plot(part.cancel.series,type='o',main='Cancellation Variation Until Dec 2019',ylab='Cancellation Rate')
ontime.series<-ts(ontime.list[0:110],start=c(2011,1),frequency=12)
#plot(part.ontime.series,type='o',main='On-Time Variation Until Dec 2019',ylab='On-Time Rate')
avgdelay.series<-ts(avgdelay.list[0:110],start=c(2011,1),frequency=12)
#plot(avgdelay.series,type='o',ylab='Average Delay Minutes')
plot(decompose(cancel.series))
plot(decompose(ontime.series))
plot(decompose(avgdelay.series))
Is it stationary? No! seasonal pattern. kind of Invertible?
par(mfrow=c(2,1))
unfreq.log.cancel.series<-log(ts(cancel.series))
plot(unfreq.log.cancel.series)
acf(unfreq.log.cancel.series,lag.max = 50,main='Sample ACF of log Cancellation Rate Series')
pacf(unfreq.log.cancel.series,lag.max = 50,main='Sample PACF of log Cancellation Rate Series')
plot(diff(unfreq.log.cancel.series,12))
acf(diff(unfreq.log.cancel.series,12),lag.max = 50,main='Sample ACF of 1st Differencing log Cancellation Rate Series')
pacf(diff(unfreq.log.cancel.series,12),lag.max = 50,main='Sample PACF of 1st Differencing log Cancellation Rate Series')
auto.arima(cancel.series)
## Series: cancel.series
## ARIMA(2,0,0)(1,0,0)[12] with non-zero mean
##
## Coefficients:
## ar1 ar2 sar1 mean
## 0.5550 -0.1938 0.1681 0.0162
## s.e. 0.0971 0.1033 0.1050 0.0015
##
## sigma^2 estimated as 7.554e-05: log likelihood=367.62
## AIC=-725.24 AICc=-724.67 BIC=-711.74
CR: ARIMA(2,0,0)(1,0,0)[12], From first differencing plots, we have
par(mfrow=c(2,1))
unfreq.log.ontime.series<-log(ts(ontime.series))
plot(unfreq.log.ontime.series)
acf(unfreq.log.ontime.series,lag.max = 50,main='Sample ACF of log On-Time Rate Series')
pacf(unfreq.log.ontime.series,lag.max = 50,main='Sample PACF of log On-Time Rate Series')
#Seasonal differencing
plot(diff(unfreq.log.ontime.series,12))
acf(diff(unfreq.log.ontime.series,12),lag.max = 50,main='Sample ACF of 1st Differencing log On-Time Rate Series')
pacf(diff(unfreq.log.ontime.series,12),lag.max = 50,main='Sample PACF of 1st Differencing log On-Time Rate Series')
plot(diff(diff(unfreq.log.ontime.series,12),12))
acf(diff(diff(unfreq.log.ontime.series,12),12),lag.max = 50,main='Sample ACF of 2nd Differencing log On-Time Rate Series')
pacf(diff(diff(unfreq.log.ontime.series,12),12),lag.max = 50, main='Sample PACF of 2nd Differencing log On-Time Rate Series')
auto.arima(ontime.series)
## Series: ontime.series
## ARIMA(1,0,0)(2,1,0)[12]
##
## Coefficients:
## ar1 sar1 sar2
## 0.5719 -0.5138 -0.5006
## s.e. 0.0851 0.1009 0.0938
##
## sigma^2 estimated as 0.0007657: log likelihood=209.63
## AIC=-411.26 AICc=-410.83 BIC=-400.92
par(mfrow=c(2,1))
unfreq.log.avgdelay.series<-log(ts(avgdelay.series))
plot(unfreq.log.avgdelay.series)
acf(unfreq.log.avgdelay.series,lag.max = 50,main='Sample ACF of log Average Delay Minutes Series')
pacf(unfreq.log.avgdelay.series,lag.max = 50,main='Sample PACF of log Average Delay Minutes Series')
plot(diff(unfreq.log.avgdelay.series,12))
acf(diff(unfreq.log.avgdelay.series,12),lag.max = 50,main='Sample ACF of 1st Differencing log Average Delay Minutes Series')
pacf(diff(unfreq.log.avgdelay.series,12),lag.max = 50,main='Sample PACF of 1st Differencing log Average Delay Minutes Series')
plot(diff(diff(unfreq.log.avgdelay.series),12))
acf(diff(diff(unfreq.log.avgdelay.series),12),lag.max = 50,main='Sample ACF of 2nd Differencing log Average Delay Minutes Series')
pacf(diff(diff(unfreq.log.avgdelay.series),12),lag.max = 50, main='Sample PACF of 2nd Differencing log Average Delay Minutes Series')
auto.arima(avgdelay.series)
## Series: avgdelay.series
## ARIMA(0,1,2)(0,0,2)[12] with drift
##
## Coefficients:
## ma1 ma2 sma1 sma2 drift
## -0.5064 -0.4097 0.2408 0.2088 0.1353
## s.e. 0.0869 0.0870 0.0994 0.0885 0.0409
##
## sigma^2 estimated as 10.36: log likelihood=-281
## AIC=574 AICc=574.82 BIC=590.15
library(tseries)
adf.test(unfreq.log.cancel.series,k=12)
##
## Augmented Dickey-Fuller Test
##
## data: unfreq.log.cancel.series
## Dickey-Fuller = -1.847, Lag order = 12, p-value = 0.6401
## alternative hypothesis: stationary
adf.test(unfreq.log.ontime.series,k=12)
##
## Augmented Dickey-Fuller Test
##
## data: unfreq.log.ontime.series
## Dickey-Fuller = -1.7595, Lag order = 12, p-value = 0.6764
## alternative hypothesis: stationary
adf.test(unfreq.log.avgdelay.series,k=12)
##
## Augmented Dickey-Fuller Test
##
## data: unfreq.log.avgdelay.series
## Dickey-Fuller = -1.8856, Lag order = 12, p-value = 0.6241
## alternative hypothesis: stationary
# aic bic
d<-0
for (p in 0:3)
for (q in 0:4)
if(p+q<6){
cr.model<-arima(cancel.series, order=c(p,d,q), seasonal = list(order = c(1,0,0), period = 12),method = "ML")
print(paste('p=',p,', q=',q,', AIC=',AIC(cr.model),', BIC=',BIC(cr.model)))
}
## [1] "p= 0 , q= 0 , AIC= -700.136451777541 , BIC= -692.035010680164"
## [1] "p= 0 , q= 1 , AIC= -723.825497427258 , BIC= -713.023575964088"
## [1] "p= 0 , q= 2 , AIC= -725.543862439714 , BIC= -712.041460610752"
## [1] "p= 0 , q= 3 , AIC= -724.483953426036 , BIC= -708.281071231281"
## [1] "p= 0 , q= 4 , AIC= -722.739811987439 , BIC= -703.836449426892"
## [1] "p= 1 , q= 0 , AIC= -723.803324817326 , BIC= -713.001403354157"
## [1] "p= 1 , q= 1 , AIC= -724.207330744429 , BIC= -710.704928915466"
## [1] "p= 1 , q= 2 , AIC= -725.912955185278 , BIC= -709.710072990524"
## [1] "p= 1 , q= 3 , AIC= -723.025774095118 , BIC= -704.122411534571"
## [1] "p= 1 , q= 4 , AIC= -722.199986853368 , BIC= -700.596143927028"
## [1] "p= 2 , q= 0 , AIC= -725.243010829426 , BIC= -711.740609000464"
## [1] "p= 2 , q= 1 , AIC= -723.347142494487 , BIC= -707.144260299733"
## [1] "p= 2 , q= 2 , AIC= -724.025030640364 , BIC= -705.121668079817"
## [1] "p= 2 , q= 3 , AIC= -724.947207421689 , BIC= -703.343364495349"
## [1] "p= 3 , q= 0 , AIC= -723.445538144685 , BIC= -707.242655949931"
## [1] "p= 3 , q= 1 , AIC= -722.431283235484 , BIC= -703.527920674937"
## [1] "p= 3 , q= 2 , AIC= -723.887611909099 , BIC= -702.28376898276"
Residual analysis
cancel.model <- arima(cancel.series, order=c(2,0,0),seasonal = list(order = c(1,0,0), period = 12), method="ML")
cancel.residual <- residuals(cancel.model)
plot(cancel.residual, main=expression("Residuals of CR model ARIMA(2,0,0)(1,0,0)[12]"), ylab='Residuals')
abline(h=0)
ontime.model <- arima(ontime.series, order=c(1,0,0),seasonal = list(order = c(2,1,0), period = 12), method="ML")
ontime.residual <- residuals(ontime.model)
plot(ontime.residual, main=expression("Residuals of OTR model ARIMA(1,0,0)(2,1,0)[12]"), ylab='Residuals')
abline(h=0)
avgdelay.model <- arima(avgdelay.series, order=c(0,1,2), seasonal = list(order = c(0,0,2), period = 12), method="ML")
avgdelay.residual <- residuals(avgdelay.model)
plot(avgdelay.residual, main=expression("Residuals of ADM model ARIMA(0,1,2)(0,0,2)[12]"), ylab='Residuals')
abline(h=0)
LB.test(cancel.model)
##
## Box-Ljung test
##
## data: residuals from cancel.model
## X-squared = 7.5826, df = 9, p-value = 0.5767
LB.test(ontime.model)
##
## Box-Ljung test
##
## data: residuals from ontime.model
## X-squared = 12.835, df = 9, p-value = 0.1702
LB.test(avgdelay.model)
##
## Box-Ljung test
##
## data: residuals from avgdelay.model
## X-squared = 3.9267, df = 8, p-value = 0.8637
qqnorm(cancel.residual)
qqline(cancel.residual)
shapiro.test(cancel.residual)
##
## Shapiro-Wilk normality test
##
## data: cancel.residual
## W = 0.90547, p-value = 9.682e-07
shapiro.test(ontime.residual)
##
## Shapiro-Wilk normality test
##
## data: ontime.residual
## W = 0.98201, p-value = 0.1446
shapiro.test(avgdelay.residual)
##
## Shapiro-Wilk normality test
##
## data: avgdelay.residual
## W = 0.98142, p-value = 0.1287
Predict
predict(cancel.model, n.ahead=10)
## $pred
## Mar Apr May Jun Jul Aug
## 2020 0.01295315 0.01710937 0.01745776 0.01732542 0.01685408 0.01632351
## Sep Oct Nov Dec
## 2020 0.01620622 0.01481093 0.01469579 0.01501457
##
## $se
## Mar Apr May Jun Jul Aug
## 2020 0.008531662 0.009757690 0.009806255 0.009813489 0.009821557 0.009822670
## Sep Oct Nov Dec
## 2020 0.009822671 0.009822706 0.009822718 0.009822718
plot(forecast(auto.arima(cancel.series)), main = expression("Forecast ARIMA(2,0,0)(1,0,0)[12]"))
lines(ts(cancel.list[110:120],start=c(2020,2),frequency=12),col='red')
plot(forecast(auto.arima(ontime.series)), main = expression("Forecast ARIMA(1,0,0)(2,1,0)[12]"))
lines(ts(ontime.list[110:120],start=c(2020,2),frequency=12),col='red')
plot(forecast(auto.arima(avgdelay.series)), main = expression("Forecast ARIMA(0,1,2)(0,0,2)[12]"))
lines(ts(avgdelay.list[110:120],start=c(2020,2),frequency=12),col='red')
pandemic series
pand.cancel.series<-ts(cancel.list[110:120],start=c(2020,03),frequency=12)
plot(pand.cancel.series)
pand.ontime.series<-ts(ontime.list[110:120],start=c(2020,03),frequency=12)
plot(pand.ontime.series)
pand.avgdelay.series<-ts(avgdelay.list[91:115],start=c(2018,07),frequency=12)
plot(pand.avgdelay.series)
auto.arima(pand.avgdelay.series)
## Series: pand.avgdelay.series
## ARIMA(1,1,0)(0,1,0)[12]
##
## Coefficients:
## ar1
## -0.5998
## s.e. 0.2498
##
## sigma^2 estimated as 18.56: log likelihood=-34.25
## AIC=72.5 AICc=73.84 BIC=73.47
new.cancel.list<-cancel.list
new.cancel.list[111]<-mean(new.cancel.list[0:110])
new.cancel.list[112]<-mean(new.cancel.list[0:110])
new.cancel.series<-ts(new.cancel.list[0:115],start=c(2011,01),frequency=12)
new.ontime.list<-ontime.list
new.ontime.list[111]<-mean(new.ontime.list[0:110])
new.ontime.list[112]<-mean(new.ontime.list[0:110])
new.ontime.series<-ts(new.ontime.list[0:115],start=c(2011,01),frequency=12)
plot(forecast(auto.arima(new.cancel.series),h=5), main = expression("Forecast ARIMA(2,0,0)(1,0,0)[12]"))
lines(ts(new.cancel.list[110:120],start=c(2020,2),frequency=12),col='red')
plot(forecast(auto.arima(new.ontime.series),h=5), main = expression("Forecast ARIMA(1,0,0)(2,1,0)[12]"))
lines(ts(new.ontime.list[110:120],start=c(2020,2),frequency=12),col='red')
avgdelay.series.extended<-ts(avgdelay.list[0:115],start=c(2011,1),frequency=12)
plot(forecast(auto.arima(pand.avgdelay.series),h=5), main = expression("Forecast ARIMA(1,1,0)(0,1,0)[12]"))
lines(ts(avgdelay.list[110:120],start=c(2020,2),frequency=12),col='red')
Nonstationary Cancellation rate: We can find p is between 0-3 and q is between 0-4. Decide by AIC and BIC find both minimal
Average delay minutes has an upwarding trend. non stationary residual flights are independent ADF unit root of AIC BIC